Data Cleaning
#import data
library(readr)
games_data <- read_csv("vgsales-12-4-2019-short.csv",
col_types = cols(Rank = col_integer())) #change Rank variable to integer class
#change different variables to factor class
games_data$Genre <- as.factor(games_data$Genre)
games_data$ESRB_Rating <- as.factor(games_data$ESRB_Rating)
games_data$Platform <- as.factor(games_data$Platform)
games_data$Publisher <- as.factor(games_data$Publisher)
games_data$Developer <- as.factor(games_data$Developer)
games_data$Year <- as.factor(games_data$Year)
#rename the levels of the ESRB Rating variable and order them
#KA (Kids to Adults) rating was changed to Everyone
library(plyr)
games_data$ESRB_Rating <- revalue(games_data$ESRB_Rating, c("E"="Everyone", "E10"="Everyone10+", "T"="Teen", "M"="Mature", "AO" = "AdultsOnly", "RP"="RatingPending", "KA"="Everyone", "EC"="EarlyChildhood"))
games_data$ESRB_Rating <- ordered(games_data$ESRB_Rating, levels = c("EarlyChildhood", "Everyone", "Everyone10+", "Teen", "Mature", "RatingPending", "AdultsOnly"))
#Change levels of Publisher and Developer variables called Unknown to missing
levels(games_data$Publisher)[levels(games_data$Publisher)=='Unknown'] <- NA
levels(games_data$Developer)[levels(games_data$Developer)=='Unknown'] <- NA
#Rename a couple Publisher and Developer levels
games_data$Publisher <- revalue(games_data$Publisher,
c("Microsoft Game Studios"="Microsoft Studios",
"Valve"="Valve Corporation", "Valve Software"="Valve Corporation", #all Valve Corporation
"Sony Computer Entertainment"="Sony Interactive Entertainment")) #all Sony Interactive Entertainment
games_data$Developer <- revalue(games_data$Developer, c("Microsoft Game Studios"="Microsoft Studios", "Valve"="Valve Corporation", "Valve Software"="Valve Corporation", "Sony Computer Entertainment"="Sony Interactive Entertainment"))
#Rename the sub-genres to their overall Genre
games_data$Genre <- revalue(games_data$Genre, c("Platform"="Action", "Shooter"="Action", "Fighting"="Action", "Visual Novel"="Adventure", "Racing"="Sports"))
#order genres by similarities for graphing
games_data$Genre <- ordered(games_data$Genre, levels = c("Action", "Action-Adventure", "Adventure", "Role-Playing", "Simulation", "Strategy", "Sports", "Board Game", "Puzzle", "Education", "Music", "MMO", "Party", "Misc"))
#revalue factors of Platform to be their overall platform
library(forcats)
games_data$Platform <- fct_collapse(games_data$Platform, Xbox = c("X360","XB", "XBL", "XOne"),
Nintendo = c("Wii","WiiU", "WW", "N64", "GC", "NES", "3DS", "DS", "DSi", "DSiW", "GB", "GBA", "GBC", "SNES", "NS", "VB", "VC"),
PlayStation = c("PS", "PS2", "PS3", "PS4", "PSP", "PSV", "PSN"),
Atari = c("2600", "5200", "7800", "AJ", "AST", "Lynx"),
PC = c("PC", "OSX", "BRW", "Linux"),
Mobile = c("And", "iOS", "WinP"),
Sega = c("GEN", "SAT", "GG", "SCD", "DC", "MSX", "WS", "S32X", "MS"),
Other = c("BBCM", "ACPC", "Amig", "ApII", "C128", "C64", "FMT", "NG", "3DO", "Aco", "CD32", "Arc", "CDi", "CD32", "GIZ", "Int", "iQue", "Mob", "NGage", "Ouya", "PCFX", "PCE", "TG16", "ZXS", "MSD"))
Data Transformations
#Create variable that merges Total_Shipped and Global_Sales to represent Total Sales
#and to have less missing data for a model
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
games_data$total_sales <- coalesce(games_data$Total_Shipped, games_data$Global_Sales)
data0 <- games_data[complete.cases(games_data$total_sales),] #21242 observations
#exclude sales of 0
data1<- data0%>% filter(total_sales > 0)# 19862 observations
#plot histogram and qq plot of total sales
par(mfrow=c(1,2)) #put graphs side by side
library(rcompanion)
plotNormalHistogram(data1$total_sales, col="lightblue3", linecol="red", lwd=2,
xlab="Total Sales (Millions)") #plot histogram of total sales
qqnorm(data1$total_sales) #q-q plot of total sales
qqline(data1$total_sales, col="red") #add line to q-q plot

#plot log transformation of total sales
logTotal = log(data1$total_sales) #log transformation of total sales
data1 <- cbind(data1, logTotal) #add column to new dataset with logTotal
#plot histogram and q-q plot of log of total sales
par(mfrow=c(1,2)) #put graphs side by side
plotNormalHistogram(logTotal, col="lightblue3", linecol="red", lwd=2,
xlab="Log of Total Sales") #plot histogram of log of sales
qqnorm(data1$logTotal) #q-q plot of total sales
qqline(data1$logTotal, col="red") #add line to q-q plot

data_log <- full_join(games_data, data1)
## Joining, by = c("Rank", "Name", "Genre", "ESRB_Rating", "Platform", "Publisher", "Developer", "Critic_Score", "User_Score", "Total_Shipped", "Global_Sales", "NA_Sales", "PAL_Sales", "JP_Sales", "Other_Sales", "Year", "total_sales")
# subset of data without missing values in critic score
data2 <- games_data[complete.cases(games_data$Critic_Score),] #6536 observations
#plot histogram and qq plot of critic score
par(mfrow=c(2,2)) #put graphs side by side
plotNormalHistogram(data2$Critic_Score, col="lightblue3", linecol="red", lwd=2,
xlab="Critic Score") #plot histogram of critic scores
qqnorm(data2$Critic_Score) #q-q plot of critic score
qqline(data2$Critic_Score, col="red") #add line to q-q plot
#plot log transformation of critic score
logCritic = log(data2$Critic_Score) #log transformation of critic score
#plot histogram and q-q plot of log of critic score
plotNormalHistogram(logCritic, col="lightblue3", linecol="red", lwd=2,
xlab="Log of Critic Scores") #plot histogram of log of critic scores
qqnorm(logCritic) #q-q plot of critic score
qqline(logCritic, col="red") #add line to q-q plot

# subset of data without missing values in user score
data3 <- games_data[complete.cases(games_data$User_Score),] #335 observations
#plot histogram and qq plot of user score
par(mfrow=c(2,2)) #put graphs side by side
plotNormalHistogram(data3$User_Score, col="lightblue3", linecol="red", lwd=2,
xlab="User Scores") #plot histogram of user scores
qqnorm(data3$User_Score) #q-q plot of user score
qqline(data2$User_Score, col="red") #add line to q-q plot
#plot log transformation of user score
logUser = log(data3$User_Score) #log transformation of user score
#plot histogram and q-q plot of log of user score
plotNormalHistogram(logUser, col="lightblue3", linecol="red", lwd=2,
xlab="Log of User Scores") #plot histogram of log of user scores
qqnorm(logUser) #q-q plot of user score
qqline(logUser, col="red") #add line to q-q plot

# subset of data without missing values in NA Sales (North America)
dataNAS <- games_data[complete.cases(games_data$NA_Sales),] #12964 observations
dataJP <- games_data[complete.cases(games_data$JP_Sales),] #7043 observations
dataPAL <- games_data[complete.cases(games_data$PAL_Sales),] #13189 observations
dataOther <- games_data[complete.cases(games_data$Other_Sales),] #15522 observations
#exclude sales of 0
dataNAS <- dataNAS%>% filter(NA_Sales > 0) #12679 observations
dataJP <- dataJP%>% filter(JP_Sales > 0) #6618 observations
dataPAL <- dataPAL%>% filter(PAL_Sales > 0) #10911 observations
dataOther <- dataOther%>% filter(Other_Sales > 0) #10277 observations
par(mfrow=c(2,2)) #put graphs side by side
plotNormalHistogram(dataNAS$NA_Sales, col="lightblue3", linecol="red", lwd=2,
xlab="North American Sales (in millions)") #plot histogram of NA sales
plotNormalHistogram(dataJP$JP_Sales, col="lightblue3", linecol="red", lwd=2,
xlab="Japan Sales (in millions)") #plot histogram of JP sales
plotNormalHistogram(dataPAL$PAL_Sales, col="lightblue3", linecol="red", lwd=2,
xlab="European Sales (in millions)") #plot histogram of PAL sales
plotNormalHistogram(dataOther$Other_Sales, col="lightblue3", linecol="red", lwd=2,
xlab="Sales in the rest of the world(in millions)") #plot histogram of Other sales

#find log of all geographic sales variables
logNA = log(dataNAS$NA_Sales) #log transformation of NA sales
logJP = log(dataJP$JP_Sales) #log transformation of JP sales
logPAL = log(dataPAL$PAL_Sales) #log transformation of PAL sales
logOther = log(dataOther$Other_Sales) #log transformation of Other sales
#plot all histograms of log transformed sales
par(mfrow=c(2,2)) #put graphs side by side
plotNormalHistogram(logNA, col="lightblue3", linecol="red", lwd=2,
xlab="Log of North American Sales") #plot histogram of log of NA sales
plotNormalHistogram(logJP, col="lightblue3", linecol="red", lwd=2,
xlab="Log of Japan Sales") #plot histogram of log of JP sales
plotNormalHistogram(logPAL, col="lightblue3", linecol="red", lwd=2,
xlab="Log of European Sales") #plot histogram of log of PAL sales
plotNormalHistogram(logOther, col="lightblue3", linecol="red", lwd=2,
xlab="Log of Sales in the rest of the world") #plot histogram of log of Other sales

#add log data to data set data_log with log columns
dataNAS <- cbind(dataNAS, logNA)
dataJP <- cbind(dataJP, logJP)
dataPAL <- cbind(dataPAL, logPAL)
dataOther <- cbind(dataOther, logOther)
data_log <- full_join(data_log, dataNAS)
## Joining, by = c("Rank", "Name", "Genre", "ESRB_Rating", "Platform", "Publisher", "Developer", "Critic_Score", "User_Score", "Total_Shipped", "Global_Sales", "NA_Sales", "PAL_Sales", "JP_Sales", "Other_Sales", "Year", "total_sales")
data_log <- full_join(data_log, dataJP)
## Joining, by = c("Rank", "Name", "Genre", "ESRB_Rating", "Platform", "Publisher", "Developer", "Critic_Score", "User_Score", "Total_Shipped", "Global_Sales", "NA_Sales", "PAL_Sales", "JP_Sales", "Other_Sales", "Year", "total_sales")
data_log <- full_join(data_log, dataPAL)
## Joining, by = c("Rank", "Name", "Genre", "ESRB_Rating", "Platform", "Publisher", "Developer", "Critic_Score", "User_Score", "Total_Shipped", "Global_Sales", "NA_Sales", "PAL_Sales", "JP_Sales", "Other_Sales", "Year", "total_sales")
data_log <- full_join(data_log, dataOther)
## Joining, by = c("Rank", "Name", "Genre", "ESRB_Rating", "Platform", "Publisher", "Developer", "Critic_Score", "User_Score", "Total_Shipped", "Global_Sales", "NA_Sales", "PAL_Sales", "JP_Sales", "Other_Sales", "Year", "total_sales")
Scores vs. Log of Total Sales
attach(data_log)
## The following objects are masked _by_ .GlobalEnv:
##
## logJP, logNA, logOther, logPAL, logTotal
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#plot Critic Score vs Total Sales, colored by ESRB Rating
#4706 observations
criticPlot <- plot_ly(data=data_log, x = ~Critic_Score, y = ~logTotal, color = ~Platform, type="scatter", mode = "markers", colorscale='contour')
criticPlot <- criticPlot %>% layout(title = "Critic Score vs. Log of Total Sales by Platform",
xaxis = list(title = "Critic Score"),
yaxis = list(title = "Log of Total Sales (Millions)"))
#plot User Score vs Total Sales, colored by ESRB Rating
#238 observations
userPlot <- plot_ly(data=data_log, x = ~User_Score, y = ~logTotal, color = ~Platform, type="scatter", mode = "markers")
userPlot <- userPlot %>% layout(title = "User Score vs. Log of Total Sales by Platform",
xaxis = list(title = "User Score"),
yaxis = list(title = "Log of Total Sales (Millions)"))
criticPlot
## Warning: Ignoring 51086 observations
## Warning: 'scatter' objects don't have these attributes: 'colorscale'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'stackgroup', 'orientation', 'groupnorm', 'stackgaps', 'text', 'texttemplate', 'hovertext', 'mode', 'hoveron', 'hovertemplate', 'line', 'connectgaps', 'cliponaxis', 'fill', 'fillcolor', 'marker', 'selected', 'unselected', 'textposition', 'textfont', 'r', 't', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Warning: 'scatter' objects don't have these attributes: 'colorscale'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'stackgroup', 'orientation', 'groupnorm', 'stackgaps', 'text', 'texttemplate', 'hovertext', 'mode', 'hoveron', 'hovertemplate', 'line', 'connectgaps', 'cliponaxis', 'fill', 'fillcolor', 'marker', 'selected', 'unselected', 'textposition', 'textfont', 'r', 't', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Warning: 'scatter' objects don't have these attributes: 'colorscale'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'stackgroup', 'orientation', 'groupnorm', 'stackgaps', 'text', 'texttemplate', 'hovertext', 'mode', 'hoveron', 'hovertemplate', 'line', 'connectgaps', 'cliponaxis', 'fill', 'fillcolor', 'marker', 'selected', 'unselected', 'textposition', 'textfont', 'r', 't', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Warning: 'scatter' objects don't have these attributes: 'colorscale'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'stackgroup', 'orientation', 'groupnorm', 'stackgaps', 'text', 'texttemplate', 'hovertext', 'mode', 'hoveron', 'hovertemplate', 'line', 'connectgaps', 'cliponaxis', 'fill', 'fillcolor', 'marker', 'selected', 'unselected', 'textposition', 'textfont', 'r', 't', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Warning: 'scatter' objects don't have these attributes: 'colorscale'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'stackgroup', 'orientation', 'groupnorm', 'stackgaps', 'text', 'texttemplate', 'hovertext', 'mode', 'hoveron', 'hovertemplate', 'line', 'connectgaps', 'cliponaxis', 'fill', 'fillcolor', 'marker', 'selected', 'unselected', 'textposition', 'textfont', 'r', 't', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
userPlot
## Warning: Ignoring 55554 observations
cor.test(data_log$Critic_Score, data_log$logTotal)
##
## Pearson's product-moment correlation
##
## data: data_log$Critic_Score and data_log$logTotal
## t = 29.891, df = 4704, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3752343 0.4232635
## sample estimates:
## cor
## 0.3995231
cor.test(data_log$User_Score, data_log$logTotal)
##
## Pearson's product-moment correlation
##
## data: data_log$User_Score and data_log$logTotal
## t = 5.8491, df = 236, p-value = 1.638e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2395013 0.4620801
## sample estimates:
## cor
## 0.3558262
Different Variables vs. Total Sales
ESRB Ratings
plot_ly(data_log, y = ~logTotal, color = ~ESRB_Rating, type = "box", text = ~paste("Game: ", Name, '<br>Year:', Year)) %>%
layout(title = "ESRB Rating by Log of Total Sales",
yaxis = list(title = "Log of Total Sales",
zeroline = FALSE),
xaxis = list(title = "ESRB Rating",
zeroline = FALSE))
## Warning: Ignoring 35930 observations
Genre
plot_ly(data_log, y = ~logTotal, color = ~Genre, type = "box", text = ~paste("Game: ", Name, '<br>Year:', Year)) %>%
layout(title = "Genre by Log of Total Sales",
yaxis = list(title = "Log of Total Sales",
zeroline = FALSE),
xaxis = list(title = "Genre",
zeroline = FALSE))
## Warning: Ignoring 35930 observations
Region Sales
# data_log_year <- data_log[complete.cases(data_log$Year, data_log$logNA, data_log$logPAL, data_log$logJP, data_log$logOther, data_log$logTotal),]
library(reshape2)
# data_log_year %>%
# select(Year, logNA, logPAL, logJP,
# logOther, logTotal) %>%
# melt(id.vars = "Year") %>%
# group_by(Year, variable) %>%
# summarise(logTotal = sum(value)) %>%
# ggplot(aes(x = Year, y = logTotal, color = variable, group = variable)) +
# geom_point() +
# geom_line() +
# labs(x = "Year", y = "Log of Total Sales", color = "Region") +
# theme(axis.text.x = element_text(angle = 90),
# panel.background = element_rect(),
# panel.grid.major = element_blank(),
# panel.grid.minor = element_blank())
#
games_data1 <- games_data[complete.cases(games_data$Year, games_data$NA_Sales, games_data$PAL_Sales, games_data$JP_Sales, games_data$Other_Sales, games_data$total_sales),]
games_data1 %>%
select(Year, NA_Sales, PAL_Sales, JP_Sales,
Other_Sales, total_sales) %>%
melt(id.vars = "Year") %>%
group_by(Year, variable) %>%
summarise(total_sales = sum(value)) %>%
ggplot(aes(x = Year, y = total_sales, color = variable, group = variable)) +
geom_point() +
geom_line() +
labs(x = "Year", y = "Mean Sales (in Millions)", color = "Region") +
theme(axis.text.x = element_text(angle = 90),
panel.background = element_rect(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())

#correlation plot of continuous sales variables
library(corrplot)
## corrplot 0.84 loaded
par(mfrow=c(1,2)) #plot side by side
corrData <- as.matrix(data_log[, c(18,19,20,21,22)]) #subset that only sales variables
corrData <- na.omit(corrData)
correl <- cor(corrData) #find correlations of all the variables
corrplot(correl, method="number", type = "upper") #plot correlation numbers
corrplot(correl, method="circle", type = "upper") #plot correlation circles
